In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib
from IPython.display import HTML
import requests
from datetime import datetime

Import stations CSV

My first step is to find interesting stations. One of the requirements is good time range coverage, so I will evaluate the files I have downloaded.

Reading the station coordinates CSV


In [2]:
stations = pd.read_csv(
    '../../data/ncdc/ish-history.csv',
    skiprows=1, 
    names=['usaf','wban','stname','ctry','fips','state','call','lat' ,'lon' ,'elev'],
    dtype=object)
print (len(stations))
stations[:3]


30567
Out[2]:
usaf wban stname ctry fips state call lat lon elev
0 000000 99999 NYGGBUKTA GREENLAND- STA GL GL NaN NaN +73483 +021567 +00030
1 000010 99999 JAN HAYEN NO NO NaN NaN +70983 -007700 +00229
2 000020 99999 ISFJORD RADIO SPITZBERGEN NO NO NaN NaN +78067 +013633 +00079

Get the correct coordinates by dividing the lat/lon by 1000 and the elevation by 10. Then generate a unique index that combines the USAF and WBAN identifiers of each station. Finally, drop any station that doesn't have a usable location.


In [3]:
stations['lat'] = stations.lat.apply(lambda lat: float(lat)/1000)
stations['lon'] = stations.lon.apply(lambda lon: float(lon)/1000)
stations['elev'] = stations.elev.apply(lambda e: float(e)/10.0)

In [4]:
stations['id']     = stations.apply(lambda row : "{}-{}".format(row.usaf,row.wban),axis=1)
stations.set_index(['id'],inplace=True)
stations.dropna(how='any',subset=['lat','lon'],inplace=True)
stations.head()


Out[4]:
usaf wban stname ctry fips state call lat lon elev
id
000000-99999 000000 99999 NYGGBUKTA GREENLAND- STA GL GL NaN NaN 73.483 21.567 3.0
000010-99999 000010 99999 JAN HAYEN NO NO NaN NaN 70.983 -7.700 22.9
000020-99999 000020 99999 ISFJORD RADIO SPITZBERGEN NO NO NaN NaN 78.067 13.633 7.9
000030-99999 000030 99999 BJORNOYA BARENTS SEA NO NO NaN NaN 74.467 19.283 29.0
000040-99999 000040 99999 VAROO NO NO NaN NaN 70.367 31.100 11.9

In [5]:
stations[(stations.usaf == '082840')]


Out[5]:
usaf wban stname ctry fips state call lat lon elev
id
082840-99999 082840 99999 VALENCIA/AEROPUERTO SP SP NaN LEVC 39.5 -0.467 62

Read CSV of observations

Define the path where the NCDC data has been downloaded


In [6]:
p = Path('../../data/ncdc')

Check whether the stations CSV already exists; otherwise generate it from the downloaded files. To generate the CSV, each file name is inspected, and the file is also read in full in order to count the number of observations for every station and year.


In [7]:
gsodCSV = p.joinpath('gsod.csv')
if not gsodCSV.exists():
    ops = p.joinpath('raw').joinpath('gsod').glob('**/*.op')
    agsod = []

    for op in ops:
        try:
            data = op.name.replace('.op','').split('-')
            data.append(sum(1 for line in op.open(encoding='utf8')) - 1)
            agsod.append(data)
        except UnicodeDecodeError:
            print (op.absolute())
    dfGsod = pd.DataFrame(data = agsod, columns=['usaf','wban','year','obs'])
    dfGsod['id'] = dfGsod.apply(lambda row: "{}-{}".format(row.usaf,row.wban) ,axis=1)
    dfGsod = dfGsod.set_index(['id'])
    dfGsod[['year','obs']].to_csv(str(gsodCSV))

In [8]:
print ('Reading existing stations per year CSV')
dfGsod = pd.read_csv(str(gsodCSV),index_col=['id'])
print ("{:,} station files".format(len(dfGsod)))
print ("{:,} observations".format(dfGsod.obs.sum()))
dfGsod.head()


Reading existing stations per year CSV
404,762 station files
111,648,951 observations
Out[8]:
year obs
id
033110-99999 1931 357
105130-99999 1931 196
103610-99999 1931 1
030050-99999 1931 365
122050-99999 1931 359

Year statistics per station

Now study this dataframe, grouping by id to see the total number of years recorded per station, along with the maximum and minimum year


In [9]:
year_groups = dfGsod[['year']].groupby(level=0)
year_count  = year_groups.count()
year_max    = year_groups.max()
year_min    = year_groups.min()

years = pd.concat([year_count,year_max,year_min],axis=1)
years.columns = ['count','max','min']
years.head()


Out[9]:
count max min
id
008209-99999 1 2009 2009
008210-99999 1 2009 2009
010010-99999 41 2009 1955
010013-99999 3 1988 1986
010014-99999 24 2009 1986

Count observations per station


In [10]:
obs_groups = dfGsod[['obs']].groupby(level=0)
obs_count = obs_groups.sum()
obs_count.head()


Out[10]:
obs
id
008209-99999 27
008210-99999 32
010010-99999 14180
010013-99999 335
010014-99999 6300

In [11]:
obs_count[(obs_count.index == '082840-99999')]


Out[11]:
obs
id
082840-99999 13331

Join stations and observations statistics

Now we can check that the indexes of the data frames are unique and then join them to retrieve only the stations with observations


In [12]:
stations.index.is_unique and years.index.is_unique and obs_count.index.is_unique


Out[12]:
True

In [13]:
scdf = pd.concat([stations[(stations.index=='082840-99999')],years,obs_count],axis=1,join='inner')
scdf.head()


Out[13]:
usaf wban stname ctry fips state call lat lon elev count max min obs
id
082840-99999 082840 99999 VALENCIA/AEROPUERTO SP SP NaN LEVC 39.5 -0.467 62 37 2009 1973 13331

Finally, we can study this dataset and filter the appropriate stations for our study in this first iteration.
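
The selection criteria are not made explicit at this point; a minimal sketch of what such a filter could look like, assuming scdf were built from the full stations dataframe and using purely illustrative thresholds on year coverage and observation counts:

# Illustrative selection: stations with at least 30 years recorded and
# at least 10,000 daily observations. These thresholds are assumptions,
# not the criteria actually used in this study.
candidates = scdf[(scdf['count'] >= 30) & (scdf['obs'] >= 10000)]
candidates.head()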


In [14]:
scdf.to_csv('stations_vlc.csv')

Check that the index of the imported dataframe is unique and then join it with our stations dataset. The join keeps only the rows present in both data frames by passing join='inner'.


In [15]:
selections.index.is_unique


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-15-34069fcce249> in <module>()
----> 1 selections.index.is_unique

NameError: name 'selections' is not defined
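
The selections dataframe was never created in this session, which explains the NameError above. A minimal sketch of how it could be obtained, assuming the station list written to stations_vlc.csv has been curated externally and is read back in:

# Assumption: stations_vlc.csv has been reviewed/edited outside the notebook
# and now contains the final selection of stations to keep.
selections = pd.read_csv('stations_vlc.csv', index_col='id')
selections.index.is_unique

In this single-station iteration the Köppen class is simply assigned by hand in the next cell.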

In [16]:
scdf['koppen'] = 'BSk'

Getting the observations for the selected stations

Filter the observations file index (about 400,000 station-year files covering roughly 112 million observations) by the selected stations and generate the list of files to read from the observations folder. First we join the station-year records with the selected stations, then we move the index into a column, and finally we build each path by concatenating the year (as a folder) with a file name composed of the station id and the year.


In [17]:
files_to_read = pd.merge(left=dfGsod,right=scdf,left_index=True,right_index=True,)[['year']]
files_to_read['id'] = files_to_read.index
files_to_read['path'] = files_to_read.apply(lambda row: Path('../../data/ncdc/raw/gsod').joinpath("{0}/{1}-{0}.op".format(row.year,row.id)),axis=1)
print ("{} files to read".format(len(files_to_read)))


37 files to read

Read the files defined previously and store the results in a new big data frame and CSV, adding the Köppen classification. First we define a couple of helper functions, and then we iterate over the files_to_read data frame, reading the corresponding fixed-width file and appending it to the final CSV file.


In [18]:
def getId(stn,wban):
    try:
        istn = int(stn)
        iwban = int(wban)
        return "{:0>6}-{:0>5}".format(istn,iwban)
    except ValueError:
        print("{}/{}".format(stn,wban))
        
def getStationByStnWban(stn,wban):
    try:
        koppen = scdf.loc[getId(stn,wban)].koppen
    except KeyError:
        koppen = None
    return koppen

In [19]:
observationsCSV = p.joinpath('observations_vlc.csv')
if not observationsCSV.exists():
    i = 0
    acc = 0
    for index,row in files_to_read.iterrows():
        path = row['path']
        if path.exists():
            dfObsTemp = pd.read_fwf(
                    str(path),
                    colspecs=[(0,7),(7,13),(14,18),(18,22),(25,30),(31,33),(35,41),
                              (42,44),(46,52),(53,55),(57,63),(64,66),(68,73),(74,76),
                              (78,84),(84,86),(88,93),(95,100),(102,108),(108,109),
                              (110,116),(116,117),(118,123),(123,124),(125,130),(132,138)],
                    skiprows=1,
                    names=['stn','wban','year','monthday','temp','temp_count',
                           'dewp','dewp_count','slp','slp_count','stp','stp_count',
                           'visib','visib_count','wsdp','wsdp_count','mxspd',
                           'gust','max','max_flag','min','min_flag','prcp','prc_flag','sndp','frshtt']
                    )
            dfObsTemp['koppen'] = dfObsTemp.apply(lambda row: getStationByStnWban(row.stn,row.wban),axis=1)
            dfObsTemp.to_csv(str(observationsCSV),mode='a')

            i += 1
            acc += len (dfObsTemp)

            if i % 1000 == 0:
                print("{:>8} obs".format(acc))

In [20]:
print ('Reading observations CSV')
dfObs = pd.read_csv(str(observationsCSV),index_col=0)
dfObs = dfObs[(dfObs.stn != 'stn')]
print ("{:,} observations".format(len(dfObs)))
dfObs.head()


Reading observations CSV
13,331 observations
Out[20]:
stn wban year monthday temp temp_count dewp dewp_count slp slp_count ... gust max max_flag min min_flag prcp prc_flag sndp frshtt koppen
0 82840 99999 1991 101 52.5 24 46.5 24 9999.9 0 ... 999.9 61.5 NaN 44.6 * 0.0 D 999.9 0 BSk
1 82840 99999 1991 102 47.7 24 40.9 24 9999.9 0 ... 999.9 64.4 * 35.6 * 0.0 D 999.9 0 BSk
2 82840 99999 1991 103 47.0 24 40.0 24 9999.9 0 ... 999.9 63.3 NaN 34.7 NaN 0.0 D 999.9 0 BSk
3 82840 99999 1991 104 48.5 24 37.0 24 9999.9 0 ... 20.0 57.2 NaN 39.2 NaN 0.0 D 999.9 0 BSk
4 82840 99999 1991 105 47.0 24 34.2 24 9999.9 0 ... 999.9 59.0 * 34.2 NaN 0.0 D 999.9 0 BSk

5 rows × 27 columns

Performing data management operations on the dataset

Now that we have the raw dataset, we can start the data management operations. First, generate a copy of the dataframe with only the columns we are interested in.


In [27]:
dfObs2 = dfObs.copy()[['stn','wban','year','monthday','temp','max','min','frshtt','koppen']]

Management

Generate an index using the station id and the date


In [28]:
def getDateTimeFromRow(row):
    try:
        iyear = int(row.year)
        imonth = int("{:0>4}".format(row.monthday)[0:2])
        iday = int("{:0>4}".format(row.monthday)[2:4])
        return  datetime(iyear,imonth,iday)
    except ValueError:
        return np.nan

dfObs2['id']   = dfObs2.apply(lambda row: getId(row.stn,row.wban) ,axis=1)
dfObs2['date'] = dfObs2.apply(lambda row : getDateTimeFromRow(row),axis=1)
dfObs2.set_index(['id','date'],inplace=True)

The frshtt column needs to be padded with zeros to get all the flags into the correct position. Then it is possible to derive the occurrence of the different weather conditions (fog, rain, snow, hail, thunder, tornado).


In [29]:
dfObs2['frshtt']  = dfObs2.apply(lambda row: "{:0>6}".format(row.frshtt),axis=1)
dfObs2['fog']     = dfObs2['frshtt'].apply(lambda row: row[0:1]=='1')
dfObs2['rain']    = dfObs2['frshtt'].apply(lambda row: row[1:2]=='1')
dfObs2['snow']    = dfObs2['frshtt'].apply(lambda row: row[2:3]=='1')
dfObs2['hail']    = dfObs2['frshtt'].apply(lambda row: row[3:4]=='1')
dfObs2['thunder'] = dfObs2['frshtt'].apply(lambda row: row[4:5]=='1')
dfObs2['tornado'] = dfObs2['frshtt'].apply(lambda row: row[5:6]=='1')

Recode the temperature columns: replace the missing-value sentinels with NaN, convert the columns to numeric, and finally convert from Fahrenheit to Celsius.


In [47]:
dfObs2['tempC'] = dfObs2['temp'].replace('99.9', np.nan)
dfObs2['maxC']  = dfObs2['max'].replace('99.9', np.nan)
dfObs2['minC']  = dfObs2['min'].replace('99.9', np.nan)

dfObs2['tempC'] = pd.to_numeric(dfObs2['tempC'], errors='coerce')
dfObs2['maxC']  = pd.to_numeric(dfObs2['maxC'], errors='coerce')
dfObs2['minC']  = pd.to_numeric(dfObs2['minC'], errors='coerce')

def FtoC(f):
    return (f-32)*5/9

dfObs2['tempC'] = dfObs2['tempC'].apply(lambda temp: FtoC(temp))
dfObs2['maxC'] = dfObs2['maxC'].apply(lambda temp: FtoC(temp))
dfObs2['minC'] = dfObs2['minC'].apply(lambda temp: FtoC(temp))

In [49]:
# Persist the processed dataframe (note: this overwrites the combined observations CSV)
dfObs2.to_csv(str(observationsCSV))

Frequency tables

Frequency tables for the categorical variables: koppen, thunder, and tornado


In [26]:
dfObs.koppen.value_counts(normalize=True)*100


Out[26]:
Cfb    4.206472
Cfa    4.092706
BSh    4.058425
Dfc    4.057249
Csa    4.037672
Aw     3.995913
ET     3.954154
Dsc    3.933317
BWh    3.926259
Csb    3.904413
Af     3.797369
Dfa    3.756619
Cfc    3.722338
BSk    3.693686
Dwb    3.629746
Dwa    3.600506
Dwc    3.593196
BWk    3.561688
Dfb    3.463550
Cwa    3.191236
Dfd    3.155610
Dwd    3.049323
Dsa    2.991263
Dsb    2.736005
Cwb    2.696851
EF     2.018542
Am     1.730095
Csc    1.673716
As     1.562219
Cwc    1.551464
Dsd    0.658396
dtype: float64

In [27]:
dfObs2.tornado.value_counts(normalize=True)*100


Out[27]:
False    99.988069
True      0.011931
dtype: float64

In [28]:
dfObs2.thunder.value_counts(normalize=True)*100


Out[28]:
False    94.976499
True      5.023501
dtype: float64

Categorize the temperatures by quantiles and then make the frequency table to confirm the categorization


In [29]:
dfObs2['temp4']=pd.qcut(dfObs2.tempC, 4, labels=["1=0%tile","2=25%tile","3=50%tile","4=75%tile"])
dfObs2['temp4'].value_counts(normalize=True)*100


Out[29]:
1=0%tile     25.066503
2=25%tile    25.058521
3=50%tile    24.940639
4=75%tile    24.934169
dtype: float64

And to get the cuts, we can group by the categorization column and get the max value for every group


In [38]:
group_by_year= dfObs2[['temp4','tempC']].groupby(['temp4'])
group_by_year.max()


Out[38]:
tempC
temp4
1=0%tile 3.57
2=25%tile 5.42
3=50%tile 7.15
4=75%tile 11.46
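
Alternatively, pd.qcut can return the bin edges directly with retbins=True; a brief sketch of that option:

# Alternative to the groupby: ask qcut for the quartile edges directly.
temp4, bins = pd.qcut(dfObs2.tempC, 4,
                      labels=["1=0%tile", "2=25%tile", "3=50%tile", "4=75%tile"],
                      retbins=True)
print (bins)  # the five cut points delimiting the four quartiles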

In [ ]: